# Active-learning surrogate modeling for the NetLogo "Segregation" ABM.
# Builds random-forest metamodels of the model's percent-similar output and
# compares one-shot LHS training against adaptive sampling strategies
# (random selection, uncertainty-based selection, and uncertainty-based
# selection combined with feature elimination).
# NOTE(review): rm(list = ls()) in a script wipes the user's workspace and
# options(warn = -1) silences ALL warnings globally — both are best removed.
rm(list=ls())
library(data.table)
library(tidyverse)
library(rJava)
library(RNetLogo)
library(lhs) # For maximin Latin hypercube sampling
library(ggplot2)
library(plotly) # For beautiful plotting
library(caret)
library(randomForest)
library(factoextra)
library(e1071)
library(TSrepr) # for evaluating predictive power
require(gridExtra) # NOTE(review): prefer library() — require() only returns FALSE on failure
options(warn = -1)
# Select if data generation is wanted
# (1 = run the ABM to generate fresh data and write CSVs; 0 = read cached CSVs)
GenerateTTData <- 0
Is_Headless <- 1
nl.model <- "Segregation"
nl.path <- "C:/Program Files/NetLogo 6.0.4/app"
model.path <- paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/",nl.model,".nlogo")
# Start NetLogo and load the model (GUI or headless).
# NOTE(review): the GUI branch starts NetLogo WITHOUT nl.obj, but run_model()
# below always passes nl.obj = nl.model — confirm the GUI path still works.
if (Is_Headless == 0){
NLStart(nl.path, gui = TRUE,nl.jarname='netlogo-6.0.4.jar')
NLLoadModel (model.path)
} else {
NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
NLLoadModel (model.path, nl.obj = nl.model)
#NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
#NLLoadModel (model.path, nl.obj = nl.model )
}
set.seed(1)
## Set model parameters
# Number of replications for each instance
nofrep = 10
# Number of iterations
iteration_budget = 20
# order feature names according to their definition order in run_model
feature_names = c("density","%-similar-wanted")
# NetLogo reporter queried after each run
output_name = c("percent-similar")
# Number of input parameters of the agent-based model
nofparams = length(feature_names)
# set RF parameters
ntree = 400
mtry = 2
error_type = "RMSE" # MAPE, BIAS
# choose the uncertainty measure
selection_metric <- "sd" #, "range"
# Pool / set sizes (number of parameter instances)
unlabeled_ins = 700
test_ins = 400
train_ins_oneshot = 700
train_ins_Ad = 200
# Set selection parameters
selected_ins = 5 #nofinstancesWillbeSelected in each step
# Set elimination parameters
h <- 1 # number of variables eliminated in each step
#run_model <- function(feature_names,feature_values){ # both should be in character list format
# Run one replication of the NetLogo model for a single parameter vector.
# `feature_values` must align positionally with the global `feature_names`.
# Side effects: drives the live NetLogo instance (set each parameter, setup,
# then 100 "go" ticks). Returns the value of the reporter in `output_name`.
run_model <- function(feature_values){ # both should be in character list format
  for (param_idx in seq_along(feature_names)) {
    set_cmd <- paste0("set ", feature_names[param_idx], " ", feature_values[param_idx])
    NLCommand(set_cmd, nl.obj = nl.model)
  }
  NLCommand("setup", nl.obj = nl.model)
  NLDoCommand(100, "go", nl.obj = nl.model)
  NLReport(output_name, nl.obj = nl.model)
}
#run_replicas <- function(nofrep,feature_names,feature_values) {
# Run `nofrep` independent replications of the ABM at one parameter setting
# and aggregate them into a single scalar (the mean response).
#
# nofrep         : number of replications (>= 1)
# feature_values : parameter vector forwarded unchanged to run_model()
#
# Returns the mean of the replicate outputs.
# Improved: the original accumulated results in a 1-row matrix initialized
# with NA and iterated with 1:nofrep; a preallocated numeric vector with
# seq_len() is the idiomatic, type-safe equivalent (mean() is identical).
run_replicas <- function(nofrep, feature_values) {
  replicas <- numeric(nofrep)
  for (i in seq_len(nofrep)) {
    replicas[i] <- run_model(feature_values)
  }
  mean(replicas)
}
#run_ABM = function(nofrep,nofinstances,unlabeledset,featurenames = feature_names){
# Label the first `nofinstances` rows of `unlabeledset` by running the ABM
# (`nofrep` replications each) and storing the mean response in `output`.
# NOTE(review): `unlabeledset` is a data.table — setcolorder() and `:=` below
# mutate it by reference, so the caller's table is modified as well.
run_ABM = function(nofrep,nofinstances,unlabeledset){
#unlabeledset = setcolorder(unlabeledset,featurenames)
# Put the feature columns first so the positional indexing in run_model()
# lines up; extra columns (e.g. idx/output) sit past position nofparams and
# are never read by run_model().
unlabeledset = setcolorder(unlabeledset,feature_names)
for(i in 1:nofinstances){
#unlabeledset[i, output := run_replicas(nofrep,featurenames, as.matrix(unlabeledset[i,]))]
unlabeledset[i, output := run_replicas(nofrep, as.matrix(unlabeledset[i,]))]
}
return(unlabeledset)
}
#error functions on test data
# Root-mean-square error between two numeric vectors of equal length.
rmse_func <- function(actual, predicted){
  sqrt(mean((predicted - actual)^2))
}
# Absolute percentage error (in %), elementwise; vectorized over its inputs.
# Undefined where actual == 0 (division by zero).
mape_func <- function(actual, predicted){
  100 * abs(actual - predicted) / actual
}
# Relative bias, elementwise: positive when the model under-predicts.
bias_func <- function(actual, predicted){
  (actual - predicted) / actual
}
#error functions on train data
# Out-of-bag error of a fitted randomForest model.
# Regression: OOB MSE after the final tree; classification: the full
# err.rate matrix.
# Fixed: an unrecognized model$type previously fell through both branches and
# raised the cryptic "object 'oob_error' not found"; it now fails with an
# explicit message.
obb_error_func <- function(model){
if(model$type == "regression"){
oob_error = model$mse[model$ntree]
}else if(model$type == "classification"){
oob_error = model$err.rate
}else{
stop("obb_error_func: unsupported model type: ", model$type)
}
return(oob_error)
}
# prediction functions
# Predict on `testset` with `model`, attach the predictions plus a per-row
# error column named after `errortype`, and compute overall test metrics.
#
# Returns list(predictedLabels, performance_temp, output_variables):
#   [[1]] testset + pred_output + per-row error column
#   [[2]] 1x3 matrix: mae, rmse, mape over the whole test set (TSrepr funcs)
#   [[3]] names of the (true, predicted) output columns
#
# Fixed: the per-row error branches previously tested the GLOBAL `error_type`
# instead of the `errortype` argument, silently ignoring the parameter.
# (All call sites pass error_type, so observed behavior is unchanged.)
get_test_predictions <- function(model,testset,errortype){
predictedLabels <- predict(model, testset)
predictedLabels <- cbind(testset,predictedLabels)
setnames(predictedLabels, "predictedLabels","pred_output")
# output_variables[1] = true output, output_variables[2] = predicted output
output_variables = colnames(select(predictedLabels, contains("output")))
if(errortype == "MAPE"){
predictedLabels[,MAPE := mapply(function(x,y) mape_func(x,y),get(output_variables[1]),get(output_variables[2]))]
}
if(errortype == "RMSE"){
# NOTE(review): "RMSE" of two scalars reduces to |pred - actual| per row.
predictedLabels[,RMSE := mapply(function(x,y) rmse_func(x,y),get(output_variables[1]),get(output_variables[2]))]
}
if(errortype == "BIAS"){
predictedLabels[,BIAS := mapply(function(x,y) bias_func(x,y),get(output_variables[1]),get(output_variables[2]))]
}
output_variables_1 = predictedLabels[,get(output_variables[1]), with = TRUE]
output_variables_2 = predictedLabels[,get(output_variables[2]), with = TRUE]
# Overall metrics; values are placeholders until overwritten below.
performance_temp = matrix(NA_real_, nrow = 1, ncol = 3)
performance_temp[1] = mae(output_variables_1 , output_variables_2)
performance_temp[2] = rmse(output_variables_1 , output_variables_2)
performance_temp[3] = mape(output_variables_1 , output_variables_2)
return(list(predictedLabels,performance_temp,output_variables))
}
# Adaptive sample selection function with an uncertainty measure depending on "selection_metric"
# Scores every row of `unlabeled_set` by the disagreement among the forest's
# individual trees and returns the `selected_ins` most uncertain rows.
# Reads the global `selection_metric` ("sd" or "range"); adds an `idx` column
# to `unlabeled_set` BY REFERENCE so callers can map candidates back to pool rows.
sample_selection <- function(selected_ins,unlabeled_set,model){
# After t(), each data.table column holds the per-tree predictions of one row.
ind_pred <- t(predict(model, unlabeled_set,predict.all = TRUE)$individual) %>%
data.table() # predictions by each tree in the forest
ind_pred_eval = data.table()
# standard deviation calculation
s_dev = sapply(ind_pred, sd) %>% data.table()
setnames(s_dev,".","sd")
ind_pred_eval = cbind(ind_pred_eval,s_dev)
# range calculation
# NOTE(review): `range` here shadows base::range, and the j-expression below
# subtracts the min/max rows of this intermediate table — verify carefully
# before refactoring.
range = sapply(ind_pred, range) %>% t() %>% data.table()
range = range[,.(range = abs(range[,1] - range[,2]))]
setnames(range,"range.V1","range")
ind_pred_eval = cbind(ind_pred_eval,range)
ind_pred_eval[,idx := 1:.N]
# Keep the selected_ins rows with the largest uncertainty score.
if(selection_metric == "sd") {
ind_pred_eval = ind_pred_eval[order(-sd)][1:selected_ins]
}else if(selection_metric == "range"){
ind_pred_eval = ind_pred_eval[order(-range)][1:selected_ins]
}
unlabeled_set[,idx := 1:.N]
train_candidates = unlabeled_set[ind_pred_eval$idx]
return(train_candidates)
}
# Random sample selection
# Draw `selected_ins` rows uniformly at random (without replacement) from
# `unlabeled_set`. An `idx` row-number column is added by reference first so
# the returned candidates keep their pool positions for later removal.
random_sample_selection <- function(selected_ins,unlabeled_set){
  unlabeled_set[,idx := 1:.N]
  chosen_idx <- sample(unlabeled_set$idx, selected_ins, replace = FALSE, prob = NULL)
  unlabeled_set[idx %in% chosen_idx]
}
# Rank feature names by permutation importance (type = 1, unscaled) of a
# fitted randomForest model, most important first.
# NOTE(review): ranks the global `feature_names`; assumes its order matches
# the model's training columns — confirm at call sites.
# Improved: dropped the unused `ordered.importances` local.
get_variable_importance <- function(model){
importances <- importance(model, type = 1, scale = FALSE)
selected.vars <- order(importances, decreasing = TRUE)
ranked_features = feature_names[selected.vars]
return(ranked_features)
}
# Drop the `h` least-important of the still-active features, on top of the
# `total_numof_eliminated_vars` removed in earlier rounds.
#
# ranked_features : all original features, most important first
#
# Returns list(columns_left, updated total_numof_eliminated_vars, h,
#              eliminated_columns).
#
# Fixed: the original derived the eliminated indices with setdiff() applied
# to two scalars, which reported only ONE eliminated feature whenever h > 1;
# the explicit index range below handles any h (identical result for h = 1).
feature_elimination <- function(h,total_numof_eliminated_vars,ranked_features){
n_active = length(ranked_features) - total_numof_eliminated_vars
numof_columns_left = n_active - h
columns_left = ranked_features[1:numof_columns_left]
eliminated_columns = ranked_features[(numof_columns_left + 1):n_active]
# update total_numof_eliminated_vars
total_numof_eliminated_vars = length(ranked_features) - length(columns_left)
return(list(columns_left,total_numof_eliminated_vars,h,eliminated_columns))
}
## Unlabeled pool: maximin LHS design over the 2 parameters, scaled to [10, 90].
if(GenerateTTData == 1){
unlabeled_pool = as.data.table(maximinLHS(n = unlabeled_ins, k = nofparams, dup = 5))
unlabeled_pool$V1 = qunif(unlabeled_pool$V1, 10, 90)
unlabeled_pool$V2 = qunif(unlabeled_pool$V2, 10, 90)
setnames(unlabeled_pool, c(paste0("V",1:nofparams)), feature_names)
unlabeled_pool[,idx := 1:.N]
fwrite(unlabeled_pool, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_",Sys.Date(),".csv"))
}else{
unlabeled_pool <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_04122019.csv")
# Trim the cached pool to the same domain and cap at 700 rows.
unlabeled_pool <- head(unlabeled_pool[`%-similar-wanted` < 90 & `density` < 90],700)
}
# PCA scatter of the pool (sanity check of the space-filling design).
# NOTE(review): assumes the cached CSV already contains an `idx` column — verify.
pca_unlabeled_pool <- princomp(unlabeled_pool[,-c("idx")], cor = TRUE, scores = TRUE)
pca_unlabeled_pool_components <- get_pca_ind(pca_unlabeled_pool)
p_unlabeled_pool <- ggplot(data = data.table(pca_unlabeled_pool_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
geom_point() +
labs( title = "")
p_unlabeled_pool
## Test set: uniform random draws over the same domain, labeled via the ABM.
if(GenerateTTData == 1){
test_set <- head(unlabeled_pool,test_ins)
# TODO: something could be added here depending on the variables' data types
test_set$density = runif(test_ins, 10, 90)
test_set$`%-similar-wanted` = runif(test_ins, 10, 90)
test_set[,c("idx"):= NULL]
print(paste0("ABM run start time : ",Sys.time()))
test_set = run_ABM(nofrep,test_ins,test_set) %>% as.data.table()
print(paste0("ABM run end time : ",Sys.time()))
fwrite(test_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_",Sys.Date(),".csv"))
}else{
test_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_04122019.csv")
#below part is only for this .csv
# Filter the cached file, then subsample test_ins rows at random.
test_set <- head(test_set[`%-similar-wanted` < 90],800)
test_set[,idx := 1:.N]
test_set_idx = sample(test_set$idx, test_ins, replace = FALSE, prob = NULL)
test_set = test_set[idx %in% test_set_idx]
test_set[,idx:= NULL]
}
# Timing notes: 10 x 10 ~ 1 min; 100 x 10 ~ 14 min; 900 x 10 ~ 09:16 -- 2019-12-03 07:54:10 +03
# PCA scatter of the test set (visual check of its coverage of the space).
pca_test_set <- princomp(test_set, cor = TRUE, scores = TRUE)
pca_test_set_components <- get_pca_ind(pca_test_set)
p_test_set <- ggplot(data = data.table(pca_test_set_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
geom_point() +
labs( title = "")
p_test_set
# Select a very big data pool (nofinstances should be very high), like 1000
## One-shot training design: maximin LHS, labeled in a single batch.
if(GenerateTTData == 1){
training_set = as.data.table(maximinLHS(n = train_ins_oneshot, k = nofparams, dup = 5))
training_set$V1 = qunif(training_set$V1, 10, 90)
training_set$V2 = qunif(training_set$V2, 10, 90)
setnames(training_set, c(paste0("V",1:nofparams)), feature_names)
training_set$output <- 0.00
print(paste0("ABM run start time : ",Sys.time()))
# Fixed: the original called run_ABM on `LHSample`, which is undefined in
# this script; the freshly built training_set is what must be labeled.
training_set = run_ABM(nofrep,train_ins_oneshot,training_set) %>% as.data.table()
print(paste0("ABM run end time : ",Sys.time()))
fwrite(training_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/training_set_",Sys.Date(),".csv"))
}else{
training_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Data_04122019.csv")
# NOTE(review): unlike the other pools this filters only on %-similar-wanted,
# not density — confirm that is intentional.
training_set <- head(training_set[`%-similar-wanted` < 90],700)
}
one_shot_data = copy(training_set)
# PCA of the one-shot training design, colored by the ABM output.
pca_training_set <- princomp(training_set[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)
pca_training_set_components <- get_pca_ind(pca_training_set)
pca_training_set_components <-cbind(pca_training_set_components$coord[,1:2],training_set[,.SD, .SDcols = c("output")])
p_training_set <- ggplot(data = pca_training_set_components, aes(x = Dim.1, y = Dim.2)) +
geom_point(aes(colour = output)) +
labs( title = "", legend = "output")
p_training_set
# Baseline: one-shot random forest trained on the full LHS design at once.
model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE,ntree = ntree, mtry = mtry)
model_oneshot
obb_error_oneshot <- obb_error_func(model_oneshot)
#OBB_pred = cbind(training_set$output,model_oneshot$predicted)
#names(OBB_pred) <- c("actual","predicted")
# OOB MSE as a function of the number of trees.
plot(model_oneshot$mse, type="l")
test_prediction_oneshot = get_test_predictions(model_oneshot,test_set,error_type)
predictedLabels_oneshot = test_prediction_oneshot[[1]]
performance_table_oneshot = data.table(iter = numeric(), mae= numeric(),rmse= numeric(), mape = numeric())
# Keep test set error records
performance_table_oneshot = rbind(performance_table_oneshot, data.table(1, test_prediction_oneshot[[2]]), use.names = FALSE)
output_variables = test_prediction_oneshot[[3]]
performance_table_oneshot
obb_error_oneshot
head(predictedLabels_oneshot)
# Actual vs fitted on the test set, colored by the signed residual.
p_oneshot <- ggplot(predictedLabels_oneshot,aes(x = get(output_variables[1]), y = get(output_variables[2]), color = (get(output_variables[2]) - get(output_variables[1])))) +
geom_point() +
geom_abline() +
xlab("actual values") +
ylab("fitted values")
p_oneshot
# Select a relatively big data pool (nofinstances should be medium), like 400
## Strategy "Rd": adaptive training with RANDOM sample selection.
# Initial adaptive training design (smaller LHS of train_ins_Ad points).
if(GenerateTTData == 1){
training_set_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
training_set_Ad$V1 = qunif(training_set_Ad$V1, 10, 90)
training_set_Ad$V2 = qunif(training_set_Ad$V2, 10, 90)
setnames(training_set_Ad, c(paste0("V",1:nofparams)), feature_names)
training_set_Ad$output <- 0.00
print(paste0("ABM run start time : ",Sys.time()))
training_set_Ad = run_ABM(nofrep,train_ins_Ad,training_set_Ad) %>% as.data.table()
print(paste0("ABM run end time : ",Sys.time()))
fwrite(training_set_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
}else{
training_set_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
training_set_Ad <- head(training_set_Ad[`%-similar-wanted` < 90 & `density` < 90],200)
}
# Snapshot so the later strategies can restart from the same initial design.
adaptive_initial_data = copy(training_set_Ad)
# Decide on strategy:
#iteration_budget = 3 #specified above
## initialize record tables Record train candidates
train_candidates_table = data.table()
# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())
# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())
## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)
print(paste0("section start time : ",Sys.time()))
iter = 1
# Each iteration: fit RF -> record OOB/test errors -> (except last iter)
# label selected_ins randomly chosen pool rows and grow the training set.
while(iter <= iteration_budget){
print(iter)
trainx = training_set_Ad[,.SD, .SDcols = feature_names]
trainy = training_set_Ad$output
# Train the model
model_Sub <- randomForest( x = trainx, y = trainy,importance = TRUE,ntree = ntree, mtry = mtry)
assign(paste0("model_Sub_",iter),model_Sub)
obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)
# test the model on test set
test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
predictedLabels_Sub = test_predictions_Sub[[1]]
setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))
predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
# Keep test set error records
performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)
if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
## sample selection from unlabeled data select candidates
unlabeled_set <- copy(unlabeled_pool)
train_candidates = random_sample_selection(selected_ins,unlabeled_set)
# Eliminate train candidates from the unlabeled pool
# (negative indexing by position; idx was assigned 1:.N on the copy, so
# positions match the current pool ordering).
unlabeled_pool = unlabeled_pool[- train_candidates$idx]
rm(unlabeled_set)
# run ABM to find outputs of train candidates
print(paste0("ABM train_candidate run start time : ",Sys.time()))
train_candidates = run_ABM(nofrep,selected_ins,train_candidates)
print(paste0("ABM train_candidate run end time : ",Sys.time()))
train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
# Add new data to train data
training_set_Ad = rbind(training_set_Ad,train_candidates[,-c("idx")])
}
iter = iter + 1
}
# TODO: add a plot to display progress at every iteration.
#setcolorder(data,variableorder) # NOTE: this may need to be placed somewhere — be careful!
print(paste0("section end time : ",Sys.time()))
# Timing: started 2020-01-08 20:13:23 // ended 2020-01-08 20:31:12 // 10 nofrep x 5 sample x 19 selection iters = 950 runs
# Final records
# Snapshot the "Rd" (random-selection) results under stable names before the
# working tables are reused by the next strategy.
FinalTrainData_Rd = copy(training_set_Ad)
performance_table_Rd = copy(performance_table)
train_candidates_table_Rd = copy(train_candidates_table)
predictedLabels_table_Rd = copy(predictedLabels_table)
obb_error_Rd = copy(obb_error)
#fwrite(FinalTrainData_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Rd_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Rdd_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Rd_BasicCode_",Sys.Date(),".csv") )
# show results
nrow(FinalTrainData_Rd)
performance_table_Rd
train_candidates_table_Rd
head(predictedLabels_table_Rd)
obb_error_Rd
# Long format (iter, errortype, errorvalue) for plotting the error curves.
performance_molten_Rd <- melt(data = performance_table_Rd
, id.vars = 'iter')
setnames(performance_molten_Rd, c("variable","value"),c("errortype","errorvalue"))
p_Rd = ggplot(performance_molten_Rd, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) +
geom_line(lwd=1)
p_Rd
# Select a relatively big data pool (nofinstances should be medium), like 400
## Strategy "Ad": uncertainty-based sample selection.
# Restart from the same initial design the "Rd" strategy used.
training_set_Ad = copy(adaptive_initial_data)
#if(GenerateTTData == 1){
#
# LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#
# LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90)
# LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90)
# setnames(LHSample_Ad, c("V1","V2"), feature_names)
# LHSample_Ad$output <- 0.00
#
# paste0("ABM run start time : ",Sys.time())
# LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
# paste0("ABM run end time : ",Sys.time())
#
# fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
# LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
# LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90 & `density` < 90],200)
#
#}
# PCA of the initial adaptive design, colored by the ABM output.
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
#fviz_pca_ind(pca_LHSample,
# col.ind = "cos2", # Color by the quality of representation
# gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
# geom="point"
# )
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
geom_point(aes(colour = output)) +
labs( title = "", legend = "output")
p_training_set_Ad
# Decide on strategy:
#iteration_budget = 3
#h = 1 # specify how many variable will be eliminated in each elimination iteration
## initialize record tables Record train candidates
# NOTE: these working tables are reused across strategies; the Rd results
# were snapshotted above before this reset.
train_candidates_table = data.table()
# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())
# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())
## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)
print(paste0("section start time : ",Sys.time()))
iter = 1
# Same loop as the "Rd" strategy, but candidates come from sample_selection()
# (highest tree-disagreement) instead of random draws.
while(iter <= iteration_budget){
print(iter)
trainx = training_set_Ad[,.SD, .SDcols = feature_names]
trainy = training_set_Ad$output
# Train the model
model_Sub <- randomForest( x = trainx, y = trainy,importance = TRUE,ntree = ntree, mtry = mtry)
assign(paste0("model_Sub_",iter),model_Sub)
obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)
# test the model on test set
test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
predictedLabels_Sub = test_predictions_Sub[[1]]
setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))
predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
# Keep test set error records
performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)
if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
## sample selection from unlabeled data select candidates
unlabeled_set <- copy(unlabeled_pool)
train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
# eliminate candidates from the unlabeled pool
unlabeled_pool = unlabeled_pool[-train_candidates$idx]
rm(unlabeled_set)
# run ABM to find outputs of train candidates
print(paste0("ABM train_candidate run start time : ",Sys.time()))
train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
print(paste0("ABM train_candidate run end time : ",Sys.time()))
train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
# add labeled candidates to the train data
training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
}
iter = iter + 1
}
print(paste0("section end time : ",Sys.time()))
# Final records
# Snapshot the "Ad" (uncertainty-selection) results under stable names.
FinalTrainData_Ad = copy(training_set_Ad)
performance_table_Ad = copy(performance_table)
train_candidates_table_Ad = copy(train_candidates_table)
predictedLabels_table_Ad = copy(predictedLabels_table)
obb_error_Ad = copy(obb_error)
#fwrite(FinalTrainData_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Ad_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_Ad,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Ad_BasicCode_",Sys.Date(),".csv") )
nrow(FinalTrainData_Ad)
performance_table_Ad
train_candidates_table_Ad
head(predictedLabels_table_Ad)
obb_error_Ad
# Long format for plotting, then compare Rd vs Ad error curves.
performance_molten_Ad <- melt(data = performance_table_Ad
, id.vars = 'iter')
setnames(performance_molten_Ad, c("variable","value"),c("errortype","errorvalue"))
p_Ad = ggplot(performance_molten_Ad, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) +
geom_line(lwd=1)
p_Ad
grid.arrange(p_Rd, p_Ad, ncol=2)
# One-shot baseline errors as horizontal reference lines.
performance_molten_oneshot <- melt(data = performance_table_oneshot
, id.vars = 'iter')
setnames(performance_molten_oneshot, c("variable","value"),c("errortype","errorvalue"))
performance_Rd_vs_Ad = rbind(performance_molten_Rd[,.(iter,errortype,errorvalue, type = "Rd")],performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")])
p_Rd_vs_Ad = ggplot(performance_Rd_vs_Ad, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) +
geom_line(lwd=1) +
# NOTE(review): stat = "hline" was removed in modern ggplot2; geom_hline no
# longer accepts it — confirm the installed ggplot2 version or drop the arg.
geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
facet_wrap(~type)
p_Rd_vs_Ad
ggplotly(p_Rd_vs_Ad)
# Final-iteration RMSE of Rd and Ad versus the one-shot baseline.
comp = performance_Rd_vs_Ad[iter == 20 & errortype =="rmse"]
comp[, oneshot_error := performance_molten_oneshot[errortype =="rmse"]$errorvalue]
comp[,diff := (errorvalue - oneshot_error) ]
comp[,diff_perc := (errorvalue - oneshot_error) / oneshot_error ]
comp
# Select a relatively big data pool (nofinstances should be medium), like 400
## Strategy "AdFe": uncertainty-based selection + feature elimination.
# Restart from the same initial design the other strategies used.
training_set_Ad = copy(adaptive_initial_data)
#if(GenerateTTData == 1){
#
# LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#
# LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90)
# LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90)
# setnames(LHSample_Ad, c("V1","V2"), feature_names)
# LHSample_Ad$output <- 0.00
#
# paste0("ABM run start time : ",Sys.time())
# LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
# paste0("ABM run end time : ",Sys.time())
#
# fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
# LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
# LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90 & `density` < 90],200)
#
#}
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
#fviz_pca_ind(pca_LHSample,
# col.ind = "cos2", # Color by the quality of representation
# gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
# geom="point"
# )
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
geom_point(aes(colour = output)) +
labs( title = "", legend = "output")
p_training_set_Ad
# Decide on strategy:
# Iterations at which to select new samples, and at which to eliminate features.
sample_selection_iteration_order = c(1:19)
feature_elimination_iteration_order = c(19)
#iteration_budget = 3 # should be > max(max(sample_selection_iteration_order),max(feature_elimination_iteration_order))
#h = 1 # specify how many variable will be eliminated in each elimination iteration
feature_names
## initialize record tables Record train candidates
train_candidates_table = data.table()
# Record model performances
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())
# Record obb_error table
obb_error = data.table(iter = numeric(), obb_error = numeric())
# Record iteration history
iteration_history = data.table(iter_no = numeric(), IsFeatureEliminated = logical(), IsDataSelected = logical())
## initialize variables
# keep test set undistorted
predictedLabels_table = copy(test_set)
# specify variables(columns) to be used initialize
columns_left = feature_names
total_numof_eliminated_vars <- 0
print(paste0("section start time : ",Sys.time()))
iter = 1
# Same loop as "Ad", plus optional feature elimination: the RF is trained on
# `columns_left` only, and the least-important feature(s) are dropped at the
# iterations listed in feature_elimination_iteration_order.
while (iter <= iteration_budget) {
trainx = training_set_Ad[, .SD, .SDcols = columns_left]
trainy = training_set_Ad$output
# Train the model
# NOTE(review): mtry is fixed at 2; after elimination columns_left may hold
# fewer than mtry features — confirm randomForest handles that as intended.
model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
assign(paste0("model_Sub_", iter), model_Sub)
# Rank features only while the full feature set is still in use.
if (length(columns_left) == length(feature_names)) {
ranked_features = get_variable_importance(model_Sub)
}
# Keep training set error records
obb_error = rbind(obb_error, data.table(iter, obb_error_func(model_Sub)), use.names = FALSE)
# Test the model on test set
test_predictions_Sub = get_test_predictions(model_Sub, test_set, error_type)
predictedLabels_Sub = test_predictions_Sub[[1]]
setnames(predictedLabels_Sub, c("pred_output", error_type), c(paste0("pred_output_", iter), paste0(error_type, "_", iter)))
predictedLabels_table = cbind(predictedLabels_table, predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))])
# Keep test set error records
performance_table = rbind(performance_table, data.table(iter, test_predictions_Sub[[2]]), use.names = FALSE)
# update iteration_history
iteration_history = rbind(iteration_history, data.table(iter, 0, 0), use.names = FALSE)
if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
if (iter %in% sample_selection_iteration_order) {
## sample selection from unlabeled data select candidates
unlabeled_set <- copy(unlabeled_pool)
train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
# eliminate candidates from the unlabeled pool
unlabeled_pool = unlabeled_pool[-train_candidates$idx]
rm(unlabeled_set)
# run ABM to find outputs of train candidates
print(paste0("ABM train_candidate run start time : ",Sys.time()))
train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
print(paste0("ABM train_candidate run end time : ",Sys.time()))
train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
# add labeled candidates to the train data
training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
# update iteration_history
# NOTE(review): `DT[i]$col <- v` updates via copy, not by reference;
# data.table::set() would be the cheaper idiom — verify it updates as expected.
iteration_history[iter]$IsDataSelected= 1
}
if (iter %in% feature_elimination_iteration_order) {
## feature elimination apply feature elimination
feature_elimination_result = feature_elimination(h, total_numof_eliminated_vars, ranked_features)
columns_left = feature_elimination_result[[1]] #
eliminated_columns = feature_elimination_result[[4]] # not necessary
total_numof_eliminated_vars = as.numeric(feature_elimination_result[2])
numof_eliminated_vars = as.numeric(feature_elimination_result[3]) # not necessary
# update iteration_history
iteration_history[iter]$IsFeatureEliminated= 1
}
}
iter = iter + 1
}
print(paste0("section end time : ",Sys.time()))
# Timing: 950 runs -- "section start time : 2020-01-08 21:46:52" // "section end time : 2020-01-08 22:02:30"
# Per-iteration error trace for the configured error_type, with a lagged
# column for iteration-over-iteration comparison; show the last row and the
# features that survived elimination.
# Fixed: `performance_error_table[.N]` was executed while the lines building
# `performance_error_table` were commented out, raising "object not found";
# the construction pipeline is restored.
performance_error_table = performance_table[,.SD,.SDcols = c("iter",tolower(error_type))]
setnames(performance_error_table,c("iter","error"))
performance_error_table[, lag_error := shift(error,1,type = "lag")]
performance_error_table[.N]
columns_left
# Final records
# Snapshot the "AdFe" (uncertainty + feature elimination) results.
FinalTrainData_AdFe = copy(training_set_Ad)
performance_table_AdFe = copy(performance_table)
train_candidates_table_AdFe = copy(train_candidates_table)
predictedLabels_table_AdFe = copy(predictedLabels_table)
obb_error_AdFe = copy(obb_error)
#fwrite(FinalTrainData_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(performance_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_AdFe_BasicCode_",Sys.Date(),".csv") )
#fwrite(obb_error_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_AdFe_BasicCode_",Sys.Date(),".csv") )
nrow(FinalTrainData_AdFe)
performance_table_AdFe
train_candidates_table_AdFe
head(predictedLabels_table_AdFe)
obb_error_AdFe
iteration_history
# Error curves with vertical markers at elimination/selection iterations.
performance_molten_AdFe <- melt(data = performance_table_AdFe
, id.vars = 'iter')
setnames(performance_molten_AdFe, c("variable","value"),c("errortype","errorvalue"))
p_AdFe = ggplot(performance_molten_AdFe, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) +
geom_line(lwd=1) +
geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
geom_vline(xintercept = iteration_history[IsDataSelected==1]$iter_no + 1, linetype = "dotdash",color = "yellow")
p_AdFe
grid.arrange(p_Ad, p_AdFe, ncol=2)
# Side-by-side Ad vs AdFe comparison against the one-shot baseline.
performance_Ad_vs_AdFe = rbind(performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")], performance_molten_AdFe[,.(iter,errortype,errorvalue, type = "AdFe")])
p_Ad_vs_AdFe = ggplot(performance_Ad_vs_AdFe, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) +
geom_line(lwd=1) +
geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
# NOTE(review): stat = "hline" was removed in modern ggplot2 — confirm the
# installed version or drop the argument.
geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
facet_wrap(~type)
p_Ad_vs_AdFe
# Interactive version of the final comparison plot.
ggplotly(p_Ad_vs_AdFe)
#varImpPlot(model_Ad)
# Shut down the NetLogo session started at the top of the script.
NLQuit(nl.obj = nl.model)
#NLQuit(all=FALSE)